In [1]:
import graphlab

In [2]:
# Limit number of worker processes. This preserves system memory, which prevents hosted notebooks from crashing.
graphlab.set_runtime_config('GRAPHLAB_DEFAULT_NUM_PYLAMBDA_WORKERS', 4)


[INFO] graphlab.cython.cy_server: GraphLab Create v2.1 started. Logging: /tmp/graphlab_server_1477284409.log
This non-commercial license of GraphLab Create for academic use is assigned to sudhanshu.shekhar.iitd@gmail.com and will expire on September 18, 2017.

Load the image dataset


In [3]:
image_train = graphlab.SFrame('image_train_data/')

In [4]:
image_test = graphlab.SFrame('image_test_data/')

In [5]:
image_train.head()


Out[5]:
id image label deep_features image_array
24 Height: 32 Width: 32 bird [0.242871761322,
1.09545373917, 0.0, ...
[73.0, 77.0, 58.0, 71.0,
68.0, 50.0, 77.0, 69.0, ...
33 Height: 32 Width: 32 cat [0.525087952614, 0.0,
0.0, 0.0, 0.0, 0.0, ...
[7.0, 5.0, 8.0, 7.0, 5.0,
8.0, 5.0, 4.0, 6.0, 7.0, ...
36 Height: 32 Width: 32 cat [0.566015958786, 0.0,
0.0, 0.0, 0.0, 0.0, ...
[169.0, 122.0, 65.0,
131.0, 108.0, 75.0, ...
70 Height: 32 Width: 32 dog [1.12979578972, 0.0, 0.0,
0.778194487095, 0.0, ...
[154.0, 179.0, 152.0,
159.0, 183.0, 157.0, ...
90 Height: 32 Width: 32 bird [1.71786928177, 0.0, 0.0,
0.0, 0.0, 0.0, ...
[216.0, 195.0, 180.0,
201.0, 178.0, 160.0, ...
97 Height: 32 Width: 32 automobile [1.57818555832, 0.0, 0.0,
0.0, 0.0, 0.0, ...
[33.0, 44.0, 27.0, 29.0,
44.0, 31.0, 32.0, 45.0, ...
107 Height: 32 Width: 32 dog [0.0, 0.0,
0.220677852631, 0.0, ...
[97.0, 51.0, 31.0, 104.0,
58.0, 38.0, 107.0, 61.0, ...
121 Height: 32 Width: 32 bird [0.0, 0.23753464222, 0.0,
0.0, 0.0, 0.0, ...
[93.0, 96.0, 88.0, 102.0,
106.0, 97.0, 117.0, ...
136 Height: 32 Width: 32 automobile [0.0, 0.0, 0.0, 0.0, 0.0,
0.0, 7.5737862587, 0.0, ...
[35.0, 59.0, 53.0, 36.0,
56.0, 56.0, 42.0, 62.0, ...
138 Height: 32 Width: 32 bird [0.658935725689, 0.0,
0.0, 0.0, 0.0, 0.0, ...
[205.0, 193.0, 195.0,
200.0, 187.0, 193.0, ...
[10 rows x 5 columns]

Computing summary statistics of the data

Using the training data, compute the sketch summary of the ‘label’ column and interpret the results. What’s the least common category in the training data? Save this result to answer the quiz at the end.


In [6]:
image_train['label'].sketch_summary()


Out[6]:
+------------------+-------+----------+
|       item       | value | is exact |
+------------------+-------+----------+
|      Length      |  2005 |   Yes    |
| # Missing Values |   0   |   Yes    |
| # unique values  |   4   |    No    |
+------------------+-------+----------+

Most frequent items:
+-------+------------+-----+-----+------+
| value | automobile | cat | dog | bird |
+-------+------------+-----+-----+------+
| count |    509     | 509 | 509 | 478  |
+-------+------------+-----+-----+------+

Creating category-specific image retrieval models


In [7]:
label_filter = lambda l : image_train[image_train['label'] == l]

In [8]:
image_train_auto = label_filter('automobile')
len(image_train_auto)


Out[8]:
509

In [9]:
image_train_cat = label_filter('cat')
len(image_train_cat)


Out[9]:
509

In [10]:
image_train_dog = label_filter('dog')
len(image_train_dog)


Out[10]:
509

In [11]:
image_train_bird = label_filter('bird')
len(image_train_bird)


Out[11]:
478

In [12]:
auto_model = graphlab.nearest_neighbors.create(image_train_auto, features=['deep_features'], label='id')
cat_model = graphlab.nearest_neighbors.create(image_train_cat, features=['deep_features'], label='id')
dog_model = graphlab.nearest_neighbors.create(image_train_dog, features=['deep_features'], label='id')
bird_model = graphlab.nearest_neighbors.create(image_train_bird, features=['deep_features'], label='id')


Starting brute force nearest neighbors model training.
Starting brute force nearest neighbors model training.
Starting brute force nearest neighbors model training.
Starting brute force nearest neighbors model training.

In [13]:
def get_images_from_ids(query_result):
    return image_train.filter_by(query_result['reference_label'], 'id')

In [14]:
show_neighbours = lambda i : get_images_from_ids(knn_model.query(image_train[i:i+1]))['image'].show()

In [15]:
image_test[0:1]['image'].show()


Canvas is accessible via web browser at the URL: http://localhost:50395/index.html
Opening Canvas in default web browser.

In [16]:
graphlab.canvas.set_target('ipynb')

In [17]:
image_test[0:1]['image'].show()


What is the nearest ‘cat’ labeled image in the training data to the cat image above (the first image in the test data)? Save this result.


In [71]:
cat_model.query(image_test[0:1])


Starting pairwise querying.
+--------------+---------+-------------+--------------+
| Query points | # Pairs | % Complete. | Elapsed Time |
+--------------+---------+-------------+--------------+
| 0            | 1       | 0.196464    | 13.066ms     |
| Done         |         | 100         | 74.237ms     |
+--------------+---------+-------------+--------------+
Out[71]:
query_label reference_label distance rank
0 16289 34.623719208 1
0 45646 36.0068799284 2
0 32139 36.5200813436 3
0 25713 36.7548502521 4
0 331 36.8731228168 5
[5 rows x 4 columns]


In [72]:
image_train_cat[image_train_cat['id'] == 16289]['image'].show()


What is the nearest ‘dog’ labeled image in the training data to the cat image above (the first image in the test data)? Save this result.


In [19]:
dog_model.query(image_test[0:1])


Starting pairwise querying.
+--------------+---------+-------------+--------------+
| Query points | # Pairs | % Complete. | Elapsed Time |
+--------------+---------+-------------+--------------+
| 0            | 1       | 0.196464    | 9.953ms      |
| Done         |         | 100         | 97.588ms     |
+--------------+---------+-------------+--------------+
Out[19]:
query_label reference_label distance rank
0 16976 37.4642628784 1
0 13387 37.5666832169 2
0 35867 37.6047267079 3
0 44603 37.7065585153 4
0 6094 38.5113254907 5
[5 rows x 4 columns]


In [73]:
image_train_dog[image_train_dog['id'] == 16976]['image'].show()


A simple example of nearest-neighbors classification

For the first image in the test data (image_test[0:1]), which we used above, compute the mean distance between this image at its 5 nearest neighbors that were labeled ‘cat’ in the training data (similarly to what you did in the previous question). Save this result.


In [20]:
cat_model.query(image_test[0:1])['distance'].mean()


Starting pairwise querying.
+--------------+---------+-------------+--------------+
| Query points | # Pairs | % Complete. | Elapsed Time |
+--------------+---------+-------------+--------------+
| 0            | 1       | 0.196464    | 12.034ms     |
| Done         |         | 100         | 53.609ms     |
+--------------+---------+-------------+--------------+
Out[20]:
36.15573070978294

Similarly, for the first image in the test data (image_test[0:1]), which we used above, compute the mean distance between this image at its 5 nearest neighbors that were labeled ‘dog’ in the training data (similarly to what you did in the previous question). Save this result.


In [21]:
dog_model.query(image_test[0:1])['distance'].mean()


Starting pairwise querying.
+--------------+---------+-------------+--------------+
| Query points | # Pairs | % Complete. | Elapsed Time |
+--------------+---------+-------------+--------------+
| 0            | 1       | 0.196464    | 9.475ms      |
| Done         |         | 100         | 46.569ms     |
+--------------+---------+-------------+--------------+
Out[21]:
37.77071136184157

[Challenging Question] Computing nearest neighbors accuracy using SFrame operations


In [22]:
label_filter_test = lambda l : image_test[image_test['label'] == l]
image_test_cat = label_filter_test('cat')
print len(image_test_cat)
image_test_dog = label_filter_test('dog')
print len(image_test_dog)
image_test_bird = label_filter_test('bird')
print len(image_test_bird)
image_test_automobile = label_filter_test('automobile')
print len(image_test_automobile)


1000
1000
1000
1000

In [23]:
print len(image_test)


4000

Finding nearest neighbors in the training set for each part of the test set


In [24]:
dog_dog_neighbors = dog_model.query(image_test_dog, k = 1)
dog_cat_neighbors = cat_model.query(image_test_dog, k = 1)
dog_automobile_neighbors = auto_model.query(image_test_dog, k = 1)
dog_bird_neighbors = bird_model.query(image_test_dog, k = 1)


Starting blockwise querying.
max rows per data block: 4348
number of reference data blocks: 8
number of query data blocks: 1
+--------------+---------+-------------+--------------+
| Query points | # Pairs | % Complete. | Elapsed Time |
+--------------+---------+-------------+--------------+
| 1000         | 63000   | 12.3772     | 297.635ms    |
| Done         | 509000  | 100         | 340.52ms     |
+--------------+---------+-------------+--------------+
Starting blockwise querying.
max rows per data block: 4348
number of reference data blocks: 8
number of query data blocks: 1
+--------------+---------+-------------+--------------+
| Query points | # Pairs | % Complete. | Elapsed Time |
+--------------+---------+-------------+--------------+
| 1000         | 63000   | 12.3772     | 281.573ms    |
| Done         | 509000  | 100         | 331.25ms     |
+--------------+---------+-------------+--------------+
Starting blockwise querying.
max rows per data block: 4348
number of reference data blocks: 8
number of query data blocks: 1
+--------------+---------+-------------+--------------+
| Query points | # Pairs | % Complete. | Elapsed Time |
+--------------+---------+-------------+--------------+
| 1000         | 64000   | 12.5737     | 276.817ms    |
| Done         | 509000  | 100         | 376.33ms     |
+--------------+---------+-------------+--------------+
Starting blockwise querying.
max rows per data block: 4348
number of reference data blocks: 8
number of query data blocks: 1
+--------------+---------+-------------+--------------+
| Query points | # Pairs | % Complete. | Elapsed Time |
+--------------+---------+-------------+--------------+
| 1000         | 60000   | 12.5523     | 287.251ms    |
| Done         | 478000  | 100         | 386.24ms     |
+--------------+---------+-------------+--------------+

In [26]:
dog_distances = graphlab.SFrame({
        'dog-dog' : dog_dog_neighbors['distance'],
        'dog-cat' : dog_cat_neighbors['distance'],
        'dog-bird': dog_bird_neighbors['distance'],
        'dog-automobile': dog_automobile_neighbors['distance']
    })

In [27]:
dog_distances.head()


Out[27]:
dog-automobile dog-bird dog-cat dog-dog
41.9579761457 41.7538647304 36.4196077068 33.4773590373
46.0021331807 41.3382958925 38.8353268874 32.8458495684
42.9462290692 38.6157590853 36.9763410854 35.0397073189
41.6866060048 37.0892269954 34.5750072914 33.9010327697
39.2269664935 38.272288694 34.778824791 37.4849250909
40.5845117698 39.1462089236 35.1171578292 34.945165344
45.1067352961 40.523040106 40.6095830913 39.0957278345
41.3221140974 38.1947918393 39.9036867306 37.7696131032
41.8244654995 40.1567131661 38.0674700168 35.1089144603
45.4976929401 45.5597962603 42.7258732951 43.2422832585
[10 rows x 4 columns]

Computing the number of correct predictions using 1-nearest neighbors for the dog class


In [54]:
dog_distances[0:1]['dog-dog']


Out[54]:
dtype: float
Rows: 1
[33.47735903726335]

In [65]:
def is_dog_correct(r):
    return r['dog-dog'] < r['dog-cat'] and r['dog-dog'] < r['dog-bird'] and r['dog-dog'] < r['dog-automobile']

In [66]:
is_dog_correct(dog_distances[0:1])


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-66-25d8b0e0c7a5> in <module>()
----> 1 is_dog_correct(dog_distances[0:1])

<ipython-input-65-608785539821> in is_dog_correct(r)
      1 def is_dog_correct(r):
----> 2     return r['dog-dog'] < r['dog-cat'] and r['dog-dog'] < r['dog-bird'] and r['dog-dog'] < r['dog-automobile']

/Users/sud/anaconda3/envs/gl-env/lib/python2.7/site-packages/graphlab/data_structures/sarray.pyc in __nonzero__(self)
    752         """
    753         # message copied from Numpy
--> 754         raise ValueError("The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()")
    755 
    756     def __bool__(self):

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

Accuracy of predicting dog in the test data:

Using the work you did in this question, what is the accuracy of the 1-nearest neighbor classifier at classifying ‘dog’ images from the test set? Save this result to answer the quiz at the end.


In [68]:
dog_distances.apply(is_dog_correct).sum() / float(len(image_test_dog))


Out[68]:
0.678

In [62]:
dog_distances


Out[62]:
dog-automobile dog-bird dog-cat dog-dog
41.9579761457 41.7538647304 36.4196077068 33.4773590373
46.0021331807 41.3382958925 38.8353268874 32.8458495684
42.9462290692 38.6157590853 36.9763410854 35.0397073189
41.6866060048 37.0892269954 34.5750072914 33.9010327697
39.2269664935 38.272288694 34.778824791 37.4849250909
40.5845117698 39.1462089236 35.1171578292 34.945165344
45.1067352961 40.523040106 40.6095830913 39.0957278345
41.3221140974 38.1947918393 39.9036867306 37.7696131032
41.8244654995 40.1567131661 38.0674700168 35.1089144603
45.4976929401 45.5597962603 42.7258732951 43.2422832585
[1000 rows x 4 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.


In [ ]: